MLS Salary Reproducible Analysis

Sample analysis

Tom Worville



In [1]:

    
import os
import glob
import pandas as pd



In [2]:

    
# Common prefix of the per-season salary CSV URLs; the download loop appends
# the year and ".csv" to this prefix to form each file's full address.
URL = (
    'https://github.com/data-is-plural/mls-salaries/raw/master/'
    'csvs/mls-salaries-'
)



In [3]:

    
# Seasons covered by the analysis: 2007 through 2016 inclusive
# (range() excludes its stop value, hence the "+ 1").
years = range(2007, 2016 + 1)



In [4]:

    
# urlretrieve lives in urllib.request on Python 3 and in urllib on Python 2;
# try the modern location first so this cell runs under either interpreter.
try:
    from urllib.request import urlretrieve
except ImportError:
    from urllib import urlretrieve

# Download one CSV per season into the current working directory, saved as
# "<year>_salaries.csv".
for year in years:
    season_label = str(year)
    urlretrieve(URL + season_label + ".csv", season_label + '_salaries.csv')



In [5]:

    
# Collect the salary files saved in the working directory.
# - The pattern is limited to "*_salaries.csv" (the naming used when the files
#   are downloaded) so unrelated CSVs lying in the directory are not picked up.
# - glob returns matches in arbitrary, platform-dependent order, so the list is
#   sorted to keep the row order of the combined table reproducible.
path = os.getcwd()
all_files = sorted(glob.glob(os.path.join(path, "*_salaries.csv")))



In [6]:

    
# Build one DataFrame holding every downloaded season.
#
# Each file is matched to its season by its exact file name
# ("<year>_salaries.csv", the name used when the files are downloaded) rather
# than by searching the whole path for the year. A year that happens to appear
# in a directory name, or in some unrelated CSV, can therefore no longer cause
# a file to be loaded under the wrong season or loaded several times.
season_by_filename = {'{}_salaries.csv'.format(year): year for year in years}

season_frames = []
for csv_path in all_files:
    season_year = season_by_filename.get(os.path.basename(csv_path))
    if season_year is None:
        continue  # not one of the per-season salary files
    print(season_year)  # progress indicator; this form runs on Python 2 and 3
    season = pd.read_csv(csv_path)
    season['season'] = season_year
    # In pandas, NaN + str is NaN, so fill missing name parts before joining;
    # otherwise a row lacking a first or last name would get a NaN `player`.
    season['player'] = (
        season['first_name'].fillna('') + " " + season['last_name'].fillna('')
    ).str.strip()
    season_frames.append(season)

# Concatenate once instead of growing `data` inside the loop, which recopies
# all previously loaded rows on every iteration. Row labels restart at 0 for
# each season, exactly as before.
data = pd.concat(season_frames) if season_frames else pd.DataFrame()



In [7]:

    
# Preview the first five rows of the combined table.
data.head(n=5)









    Out[7]:






  
    
      
      club
      last_name
      first_name
      position
      base_salary
      guaranteed_compensation
      season
      player
    
  
  
    
      0
      CHI
      Armas
      Chris
      M
      225000.0
      225000.0
      2007
      Chris Armas
    
    
      1
      CHI
      Banner
      Michael
      M
      12900.0
      12900.0
      2007
      Michael Banner
    
    
      2
      CHI
      Barrett
      Chad
      F
      41212.5
      48712.5
      2007
      Chad Barrett
    
    
      3
      CHI
      Blanco
      Cuauhtemoc
      F
      2492316.0
      2666778.0
      2007
      Cuauhtemoc Blanco
    
    
      4
      CHI
      Brown
      C.J.
      D
      106391.0
      106391.0
      2007
      C.J. Brown



In [8]:

    
# IPython magic: draw matplotlib figures inline, in the notebook's cell output.
%matplotlib inline



In [9]:

    
import matplotlib.pyplot as plt
# Switch matplotlib to its 'ggplot' style sheet for the figures drawn afterwards.
plt.style.use('ggplot')



In [10]:

    
# Plot the per-season median of the two salary measures.
# The numeric salary columns are selected explicitly: pandas 2.x raises a
# TypeError when median() meets the text columns (club, names, position,
# player) instead of silently dropping them as older releases did.
salary_columns = ['base_salary', 'guaranteed_compensation']
data.groupby('season')[salary_columns].median().plot(legend=False);



In [11]:

    
# Reshape to one row per season and one column per player; each cell holds the
# sum of that player's base_salary entries for the season (NaN when the player
# has no entry that year).
pivoted = data.pivot_table(
    values='base_salary',
    index='season',
    columns='player',
    aggfunc='sum'
)
# Display only the top-left 5x5 corner of the table.
pivoted.iloc[:5, :5]









    Out[11]:






  
    
      player
      AJ Cochran
      AJ DeLaGarza
      AJ Soares
      Aaron Guillen
      Aaron Hohlbein
    
    
      season
      
      
      
      
      
    
  
  
    
      2007
      NaN
      NaN
      NaN
      NaN
      30000.0
    
    
      2008
      NaN
      NaN
      NaN
      NaN
      33000.0
    
    
      2009
      NaN
      36000.0
      NaN
      NaN
      34650.0
    
    
      2010
      NaN
      45100.0
      NaN
      NaN
      40000.0
    
    
      2011
      NaN
      55100.0
      42000.0
      NaN
      NaN



In [12]:

    
# Draw every player's base-salary series as a faint (alpha 0.05) line without
# a legend, then restrict the y-axis to the 0-500000 range.
ax = pivoted.plot(alpha=0.05, legend=False)
ax.set_ylim(bottom=0, top=500000);



In [ ]:

	club	last_name	first_name	position	base_salary	guaranteed_compensation	season	player
0	CHI	Armas	Chris	M	225000.0	225000.0	2007	Chris Armas
1	CHI	Banner	Michael	M	12900.0	12900.0	2007	Michael Banner
2	CHI	Barrett	Chad	F	41212.5	48712.5	2007	Chad Barrett
3	CHI	Blanco	Cuauhtemoc	F	2492316.0	2666778.0	2007	Cuauhtemoc Blanco
4	CHI	Brown	C.J.	D	106391.0	106391.0	2007	C.J. Brown

player	AJ Cochran	AJ DeLaGarza	AJ Soares	Aaron Guillen	Aaron Hohlbein
season
2007	NaN	NaN	NaN	NaN	30000.0
2008	NaN	NaN	NaN	NaN	33000.0
2009	NaN	36000.0	NaN	NaN	34650.0
2010	NaN	45100.0	NaN	NaN	40000.0
2011	NaN	55100.0	42000.0	NaN	NaN